# -*- coding: utf-8 -*-
'''
Created on Sep 7, 2012

@author: LONG HOANG GIANG
'''
import sys
import os
sys.path.append(os.path.expanduser('/home5/vietcntt/longhoanggiang/python'))
sys.path.append(os.path.join(os.path.dirname(__file__), '../'))
from CrawlerLib import Http, commonlib
from lxml import etree
import datetime
from urlparse import urljoin


def processAChapter(url):
    '''
    Download one chapter page and return its body wrapped in separators.

    @param url: URL of the chapter page, passed as-is to Http.getXMLTree.
    @return: UTF-8 encoded markup of the first div whose class attribute
        contains 'field field-name-body', with the same break/rule
        separator prepended and appended.
    @raise RuntimeError: if Http.getXMLTree returns None for the URL, or
        if the page has no matching content div.
    '''
    print('> process url: {0}'.format(url))
    tree = Http.getXMLTree(url)
    if tree is None:
        # A bare ``raise`` here has no active exception to re-raise and would
        # only produce an unrelated error; report the real cause instead.
        raise RuntimeError('Could not load chapter page: {0}'.format(url))
    contentNode = tree.xpath("//div[contains(@class, 'field field-name-body')]")
    if not contentNode:
        raise RuntimeError('No chapter body found on page: {0}'.format(url))
    # NOTE(review): '</br />' is not well-formed markup; it is kept
    # byte-for-byte so the generated file does not change.
    separator = "<br /><br /><hr /><br /></br />"
    chapterHtml = etree.tounicode(contentNode[0]).encode('utf-8')
    return separator + chapterHtml + separator

def process(url, folder=None):
    '''
    Download a book's landing page plus every linked chapter and save the
    result as a single HTML file named 'alobooks.html'.

    @param url: URL of the book's landing page.
    @param folder: directory to write 'alobooks.html' into. When omitted,
        the module-level FOLDER is used (it must then be defined by the
        caller, as the script entry point does).
    @return: None. Returns without writing anything if Http.getXMLTree
        returns None for the landing page.
    @raise RuntimeError: if the landing page has no
        div[@class='field-item even'] node; errors raised by
        processAChapter for individual chapters propagate unchanged.
    '''
    tree = Http.getXMLTree(url)
    if tree is None:
        # Landing page unavailable: nothing to do (best-effort, as before).
        return
    contentNode = tree.xpath("//div[@class='field-item even']")
    if not contentNode:
        # A bare ``raise`` has no active exception to re-raise; be explicit.
        raise RuntimeError('No book description found on page: {0}'.format(url))
    if folder is None:
        # Resolve the destination before downloading anything so a missing
        # FOLDER fails fast instead of after every chapter has been fetched.
        folder = FOLDER

    # Collect the pieces and join once rather than growing a string in a loop.
    pages = ['''<html><head><meta charset="utf-8" /></head><body>''']
    pages.append(etree.tounicode(contentNode[0]).encode('utf-8'))
    for item in tree.xpath("//div[contains(@id, 'book-navigation-')]/ul/li/a"):
        # An anchor without an href attribute yields None; treat it as empty.
        href = (item.get('href') or '').strip()
        if not href:
            continue
        pages.append(processAChapter(urljoin('http://alobooks.vn', href)))
    pages.append('''</body></html>''')

    # os.path.join works whether or not ``folder`` ends with a separator.
    commonlib.file_put_contents(os.path.join(folder, 'alobooks.html'), ''.join(pages))
    return
    
    
if __name__ == '__main__':

    # Output directory; process() reads this module-level name.
    FOLDER = 'c:/Users/LONG HOANG GIANG/Desktop/'
    ############################################
    url = 'http://alobooks.vn/forum/66/sam-sam-den-day-an-ne-co-man-full-ngoai.aspx'
    ############################################
    process(url)
    print('> Finished at {0}'.format(datetime.datetime.now()))
    # os._exit() terminates immediately without flushing stdio buffers, so
    # flush first or the line above can be lost when output is redirected.
    sys.stdout.flush()
    # Exit status 0: reaching this point means the run completed normally.
    os._exit(0)
    
    
    
